1 Set up

knitr::opts_chunk$set(echo = TRUE)

# Set start time ----
startTime <- proc.time()

# Libraries ----

message("Loading libraries...")
## Loading libraries...
library(here) # where are we?
## here() starts at /Users/ben/github/dataknut/botanicals
library(data.table) # data frames with superpowers
library(ggplot2) # pretty plots
library(kableExtra) # pretty tables
library(lubridate) # date & time stuff
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following object is masked from 'package:here':
## 
##     here
## The following object is masked from 'package:base':
## 
##     date
library(plotly) # interactive plots
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(skimr) # descriptive & summary stats

# Parameters ----
dPath <- paste0(here::here(), "/data/")
dFile <- paste0(dPath, "ClimateData_1976_2019.csv")

# get data ----

message("Getting data from ", dFile)
## Getting data from /Users/ben/github/dataknut/botanicals/data/ClimateData_1976_2019.csv
weatherDT <- data.table::fread(dFile) # load data

2 Background

We want to load some UK weather data and have a look at it. We then want to see if we can:

  • find droughts. Droughts in the UK are periods of > 14 days where precipitation is lower than 0.2 (cm?);
  • ….

3 Have a look at the data

weatherDT$V1 <- NULL # what's the row number for?

# do some pre-processing ----
weatherDT[, rDate := lubridate::dmy(Date)] # make nice date
weatherDT[, year := lubridate::year(rDate)] # make nice year
weatherDT[, month := lubridate::month(rDate, label = TRUE, abbr = TRUE)] # make nice date
t <- head(weatherDT) # what have we got?

kableExtra::kable(t, caption = "First few rows of processed data") %>%
  kable_styling()
Table 3.1: First few rows of processed data
Date maxT minT prcpt rDate year month
1/01/76 8.0 1.1 17.3 1976-01-01 1976 Jan
2/01/76 9.6 0.3 41.0 1976-01-02 1976 Jan
3/01/76 10.3 1.3 26.6 1976-01-03 1976 Jan
4/01/76 10.6 1.4 5.3 1976-01-04 1976 Jan
5/01/76 9.2 -0.2 18.3 1976-01-05 1976 Jan
6/01/76 8.1 4.7 4.7 1976-01-06 1976 Jan
skimr::skim(weatherDT)
Table 3.1: Data summary
Name weatherDT
Number of rows 16063
Number of columns 7
_______________________
Column type frequency:
character 1
Date 1
factor 1
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Date 0 1 7 8 0 16063 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
rDate 0 1 1976-01-01 2019-12-31 1997-12-27 16063

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
month 0 1 TRUE 12 Jan: 1364, Mar: 1364, May: 1364, Jul: 1364

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
maxT 0 1 11.55 6.02 -6.5 7.1 11.4 15.9 32.70 ▁▆▇▃▁
minT 0 1 5.13 4.95 -14.0 1.4 5.2 9.0 25.46 ▁▃▇▃▁
prcpt 0 1 3.65 6.30 0.0 0.0 0.7 4.9 72.20 ▇▁▁▁▁
year 0 1 1997.49 12.70 1976.0 1986.0 1997.0 2008.0 2019.00 ▇▇▇▇▇

Just for fun let’s plot the data. You should be able to hover the mouse over the dots in 3.1 to investigate.

# just for fun
plotDT <- weatherDT[, .(meanPrcpt = mean(prcpt)), 
                    keyby = .(year, month)]

p <- ggplot2::ggplot(plotDT, aes(x = month, y = meanPrcpt, group = year, colour = year)) +
  geom_point() +
  scale_color_viridis_c() +
  labs(y = "Mean precipitation",
       caption = "Year treated as continuous for pretty colours")

plotly::ggplotly(p) # interactive plot

Figure 3.1: Mean precipitation per month by year

So that looks vaguely right.

4 Challenge 1: Find the droughts

Count the number of spans (per year?) of 15 or more days when the daily rainfall < 0.2. This is the UK definition of drought.

To do this we:

  • mark the first & last days in a sequence of low rainfall (< 0.2) days
  • get the time difference between them in days
  • find the ones where the difference is > 14
setkey(weatherDT,rDate) # ensures ordered by date
weatherDT[, droughtThresh := ifelse(prcpt < 0.2, 1, 0)] # to make life easier
# flag the start & ends of low precipitation periods
weatherDT[, droughtPeriod := ifelse(droughtThresh == 1 & 
                                      shift(droughtThresh == 0), # first of 1 -> n days with low
                                   "Start", 
                                   NA)] 
weatherDT[, droughtPeriod := ifelse(droughtThresh == 0 & 
                                      shift(droughtThresh == 1), # last of 1 -> n days with low
                                    "End", 
                                    droughtPeriod)] 

# now isolate the start & ends of low precipitation periods
lowPrcptPeriodsDT <- weatherDT[droughtPeriod == "Start" | 
                                 droughtPeriod == "End"]

lowPrcptPeriodsDT[, periodCount := ifelse(droughtPeriod == "Start",
                                          shift(rDate, type = "lead") - rDate, # n days between start & end (they are ordered)
                                          NA) # undefined between end & start (we don't care about periods between droughts for now)
                  ] 

# flag the droughts using the 14 day threshold
lowPrcptPeriodsDT[, drought := ifelse(periodCount > 14, 
                                      "Drought start", 
                                      "Not drought")]

plotDT <- lowPrcptPeriodsDT[, .(nDroughts = .N), # data.table magic
                            keyby = .(drought, year, start = month)] # add up number of droughts/no droughts in these periods of low precip

# save out the low precip periods
outF <- paste0(dPath, "lowPrecipSequences.csv")
data.table::fwrite(lowPrcptPeriodsDT, outF) # includes start and end of sequences
# save out just the flagged droughts
lowPrcptPeriodsDT[, drought := ifelse(droughtPeriod == "End" &
                                        shift(periodCount > 14),
                                      "Drought end",
                                      drought)
                  ] # flag drought end as drought too for data output
outF <- paste0(dPath, "lowPrecipSequencesDroughts.csv")
data.table::fwrite(lowPrcptPeriodsDT[drought %like% "Drought"], outF)

ggplot2::ggplot(lowPrcptPeriodsDT, aes(x = periodCount)) +
  geom_histogram() +
  labs(x = "Duration of low precipitation periods (sequences of days < 0.2 cm, entire period)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2364 rows containing non-finite values (stat_bin).

4.1 Counting droughts

Table ?? shows the number of low precipitation events (1 or more days with < 0.2) which were droughts (> 14 days in a row) by start month and year. Perhaps surprisingly 1976 has none.

kableExtra::kable(plotDT[drought == "Drought start"], caption = "Number of low precipitation periods that were droughts by year") %>%
  kable_styling()
Table 4.1: Number of low precipitation periods that were droughts by year
drought year start nDroughts
Drought start 1977 May 1
Drought start 1980 May 1
Drought start 1981 Aug 1
Drought start 1984 Apr 1
Drought start 1984 Aug 1
Drought start 1985 Oct 1
Drought start 1989 Nov 1
Drought start 1991 Aug 1
Drought start 1991 Nov 1
Drought start 1992 Jun 1
Drought start 1995 Apr 1
Drought start 1995 Jun 1
Drought start 1995 Jul 1
Drought start 1997 May 1
Drought start 2000 May 1
Drought start 2002 Sep 1
Drought start 2003 Mar 1
Drought start 2003 Apr 1
Drought start 2011 Apr 1
Drought start 2013 Jul 1
Drought start 2018 Jun 1
Drought start 2019 Apr 1

Just to make this clear, Figure ?? shows the pattern of number of droughts by start month and year.

p <- ggplot2::ggplot(plotDT[drought == "Drought start"], 
                aes(x = start, fill = nDroughts, y = year)) +
  geom_tile() +
  labs(caption = "Note: months without droughts (Jan/Feb & Dec) not shown")

p
Number of droughts by start month and year

Figure 4.1: Number of droughts by start month and year

If this chart in not what you expect then remember:

  • the plot is not showing the actual span of days, it is showing a count of how many drought events there were that started in a given month;
  • the method will not classify a sequence of 4 days ‘rain < 0.2 cm’ -> 1 day ‘rain > 0.2’ -> 11 days ‘rain < 0.2’ as drought. It is, literally just finding periods of > 14 consecurive days with ‘rain < 0.2’. It seems there weren’t any in 1976 despite it being a “drought” (see below);
  • meteorological drought (no rain) and hydrological drought (no ground water) are not the same thing…
  • our experience of drought and what plants experience as water stress may not correlate that closely with the meteorological definition of drought used here…

Of note:

4.2 Drought periods

We can also plot the drought spans themselves (which might have been more useful in any case :-) - see Figure 4.3.

droughtsDT <- lowPrcptPeriodsDT[drought %like% "Drought"]

ggplot2::ggplot(droughtsDT, aes(x = periodCount)) +
  geom_histogram() +
  labs(x = "Drought duration (days) - whole period")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).
Drought periods histogram

Figure 4.2: Drought periods histogram

t <- droughtsDT[, .(meanDuration = mean(periodCount, na.rm = TRUE),
                    minDuration = min(periodCount, na.rm = TRUE),
                    maxDuration = max(periodCount, na.rm = TRUE))]

kableExtra::kable(t, caption = "Duration stats (all years)") %>%
  kable_styling()
Table 4.2: Duration stats (all years)
meanDuration minDuration maxDuration
18.18182 15 21
droughtsDT[, startDate := rDate] 
droughtsDT[, endDate := shift(rDate, type = "lead")]

# so how do we plot that?
plotDT <- droughtsDT[drought == "Drought start"]
plotDT[, startDec := lubridate::date_decimal(2000 + (lubridate::decimal_date(startDate) - year))]
plotDT[, endDec := lubridate::date_decimal(2000 + (lubridate::decimal_date(endDate) - year))]

p <- ggplot2::ggplot(plotDT, aes(x = startDec, xend = endDec,
                                y = year, yend = year)) +
  geom_segment(colour = "red") +
  # theme(legend.position="bottom") +
  # guides(colour = guide_legend(title = "Year", nrow = 2)) +
  scale_y_continuous(breaks = c(1975, 1980, 1985,1990,1995,2000,2005,2010,2015,2020)) +
  labs(x = "Date") +
  scale_x_datetime(date_breaks = "1 month", date_labels = "%b %d")
p +
  geom_text(
    label= plotDT$year, 
    size = 3,
    nudge_y = 0.5,
    check_overlap = T
  )
Drought periods by year

Figure 4.3: Drought periods by year

Is that what we expect? The periods appear visually similar in length but this just reflects the tight distribution of durations (see Figure 4.2).

5 Runtime

t <- proc.time() - startTime

elapsed <- t[[3]]

Analysis completed in 9.699 seconds ( 0.16 minutes) using knitr in RStudio with R version 3.6.3 (2020-02-29) running on x86_64-apple-darwin15.6.0.

6 R environment

R packages used:

  • base R - for the basics (R Core Team 2016)
  • data.table - for fast (big) data handling (Dowle et al. 2015)
  • ggplot2 - for slick graphics (Wickham 2009)
  • here - for here (Müller 2017)
  • lubridate - date manipulation (Grolemund and Wickham 2011)
  • kableExtra - fancy tables (Zhu 2018)
  • knitr - to create this document (Xie 2016)
  • skimr - data summaries (Arino de la Rubia et al. 2017)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_NZ.UTF-8/en_NZ.UTF-8/en_NZ.UTF-8/C/en_NZ.UTF-8/en_NZ.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] skimr_2.1         plotly_4.9.2      lubridate_1.7.4   kableExtra_1.1.0 
## [5] ggplot2_3.3.0     data.table_1.12.8 here_0.1         
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.0.0  xfun_0.12         repr_1.1.0        purrr_0.3.3      
##  [5] colorspace_1.4-1  vctrs_0.2.3       htmltools_0.4.0   viridisLite_0.3.0
##  [9] yaml_2.2.1        base64enc_0.1-3   rlang_0.4.5       later_1.0.0      
## [13] pillar_1.4.3      glue_1.3.1        withr_2.1.2       lifecycle_0.2.0  
## [17] stringr_1.4.0     munsell_0.5.0     gtable_0.3.0      rvest_0.3.5      
## [21] htmlwidgets_1.5.1 evaluate_0.14     labeling_0.3      knitr_1.28       
## [25] fastmap_1.0.1     httpuv_1.5.2      crosstalk_1.0.0   highr_0.8        
## [29] Rcpp_1.0.3        xtable_1.8-4      readr_1.3.1       promises_1.1.0   
## [33] scales_1.1.0      backports_1.1.5   webshot_0.5.2     jsonlite_1.6.1   
## [37] mime_0.9          farver_2.0.3      hms_0.5.3         digest_0.6.25    
## [41] stringi_1.4.6     shiny_1.4.0       bookdown_0.18     dplyr_0.8.5      
## [45] grid_3.6.3        rprojroot_1.3-2   tools_3.6.3       magrittr_1.5     
## [49] lazyeval_0.2.2    tibble_2.1.3      crayon_1.3.4      tidyr_1.0.2      
## [53] pkgconfig_2.0.3   xml2_1.2.2        assertthat_0.2.1  rmarkdown_2.1    
## [57] httr_1.4.1        rstudioapi_0.11   R6_2.4.1          compiler_3.6.3

References

Arino de la Rubia, Eduardo, Hao Zhu, Shannon Ellis, Elin Waring, and Michael Quinn. 2017. Skimr: Skimr. https://github.com/ropenscilabs/skimr.

Dowle, M, A Srinivasan, T Short, S Lianoglou with contributions from R Saporta, and E Antonyan. 2015. Data.table: Extension of Data.frame. https://CRAN.R-project.org/package=data.table.

Grolemund, Garrett, and Hadley Wickham. 2011. “Dates and Times Made Easy with lubridate.” Journal of Statistical Software 40 (3): 1–25. http://www.jstatsoft.org/v40/i03/.

Müller, Kirill. 2017. Here: A Simpler Way to Find Your Files. https://CRAN.R-project.org/package=here.

R Core Team. 2016. R: A Language and Environment for Statistical Computing. Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.

Wickham, Hadley. 2009. Ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York. http://ggplot2.org.

Xie, Yihui. 2016. Knitr: A General-Purpose Package for Dynamic Report Generation in R. https://CRAN.R-project.org/package=knitr.

Zhu, Hao. 2018. KableExtra: Construct Complex Table with ’Kable’ and Pipe Syntax. https://CRAN.R-project.org/package=kableExtra.